Use this to keep track of useful code bits as I learn Python. Krista, August 19, 2015
Keyboard shortcuts (shortcut: action):
Shift-Enter: run cell
Ctrl-Enter: run cell in-place
Alt-Enter: run cell, insert below
Ctrl / (Ctrl and then the slash)...will comment out any selected text within a block of code
In [ ]:
# List the entries of the current working directory.
import os
import sys

os.listdir(os.getcwd())
In [ ]:
#read the CSV file named by mtabFile into a data frame, using the 'RInumber' column as the
#row index, and use the pandas head tool to show me the first five rows.
#note that this doesn't work: pd.head(CO_RawData) -- head() is a method of the data frame
#itself, not a function of the pandas module, so it has to be called as CO_RawData.head(...)
# NOTE(review): pd and mtabFile are not defined in this cell; presumably an earlier cell
# runs `import pandas as pd` and sets mtabFile -- confirm.
CO_RawData=pd.read_csv(mtabFile, index_col='RInumber')
CO_RawData.head(n=5)
In [ ]:
#insert an image into the cell output...the gif file named here would be in the folder
# Image is given the file through its url= argument (a relative name, no directory part).
from IPython.display import Image
Image(url="R02485.gif")
In [ ]:
# Minimal for-loop example: print "hello" three times.
# The loop body must be indented; without the indent Python raises a SyntaxError.
for x in range(0, 3):
    print("hello")
In [ ]:
# Set the figure-level title to the string in CO followed by ' working'.
# NOTE(review): fig and CO are not defined in this cell; they come from other cells.
fig.suptitle(CO + ' working') #use the plus sign to concatenate strings for the title
In [ ]:
# Debugging walkthrough: iterate over CO_withKO, look up the 'Related KO' and 'Related CO'
# entries for each key CO, row-normalize the matching rows of KO_RawData and CO_RawData,
# and plot both on one axis (KO rows in red, CO rows in black).
# NOTE(review): the block indentation was lost when this notebook was exported to text, so
# the nesting of the statements below cannot be read off this listing -- restore it from
# the original notebook before running.
from IPython.core.debugger import Tracer #used this to step into the function and debug it, also need line with Tracer()()
for i, CO in enumerate(CO_withKO):
#if i==2:
#break
kos=CO_withKO[CO]['Related KO']
cos=CO_withKO[CO]['Related CO']
for k in kos:
if k in KO_RawData.index:
# NOTE(review): the membership test above is on the single id k, but the lookup below
# uses the whole list kos -- confirm that is intended.
# dropna() discards rows that contain missing values.
kData=KO_RawData.loc[kos].dropna()
# Divide every row by its own row sum, so each row sums to 1.
kData=(kData.T/kData.sum(axis=1)).T
cData=CO_RawData.loc[cos].dropna()
cData=(cData.T/cData.sum(axis=1)).T
# One figure with a single axis; both data sets are drawn on it (transposed for plotting).
fig, ax=plt.subplots(1)
kData.T.plot(color='r', ax=ax)
cData.T.plot(color='k', ax=ax)
# Debugger entry point (see the comment on the import line).
Tracer()()
# NOTE(review): the row label 'C01909' is hard-coded here while the title below is built
# from the loop variable CO -- confirm whether this should be CcoClust.loc[CO].
getKmeans = CcoClust.loc['C01909']['kmeans']
makeStringLabel = CO + '_kmeansCluster_' + str(getKmeans)
#fig.suptitle(CO)
fig.suptitle(makeStringLabel)
#fig.savefig(CO+'.png') #stop saving all the images for now...
# NOTE(review): which loop this break leaves depends on the lost indentation.
break
In [7]:
#here, tData is a pandas data frame that I want to plot into a bar graph
#tData.plot(kind = "bar") ##this would be the code to run if tData existed...
#instead I am reading in the file saved and present in my working directory using this:
# Image is given the saved png through its filename= argument.
from IPython.display import Image
Image(filename="SampleBarGraph.png")
Out[7]:
In [ ]:
#indexing in Python (pandas) is a bit bizarre, or at least takes some getting used to.
# df.ix[0,'cNumber'] #this will allow me to mix index from integers with index by label
# (note: .ix is deprecated and has been removed from newer pandas versions)
#the other way uses iloc and loc, to use integers and labels respectively
# this would be df.iloc[0].loc['cNumber'] {can't get that to work in the if statement}
In [8]:
#ways to subset data...
# Pick the row labelled 'C05356', then its 'kmeans' entry.
CcoClust.loc['C05356']['kmeans']
# Keep the whole row labelled 'C05356' and check what type .loc handed back.
tData = CcoClust.loc['C05356']
type(tData)
#want to select only the first group in the kmeans clusters
#(baby steps, eventually do this for each cluster)
# Boolean mask: keep only the rows whose kmeans value equals 1.
CcoClust[CcoClust.kmeans==1]
...this is where I learned not to use pip install with scikit-learn. To upgrade scikit-learn: conda update scikit-learn
In [1]:
import sklearn.cluster
#from sklearn.cluster import KMeans
In [1]:
# Presumably average silhouette scores (19 entries); plotted as a scatter in a later cell.
silAverage = [
    0.4227, 0.33299, 0.354, 0.3768, 0.3362,
    0.3014, 0.3041, 0.307, 0.313, 0.325,
    0.3109, 0.2999, 0.293, 0.289, 0.2938,
    0.29, 0.288, 0.3, 0.287,
]
In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
OK...can I get a simple scatter plot?
In [15]:
# Scatter each silAverage value against its position in the list.
plt.scatter(range(len(silAverage)), silAverage)
plt.grid()  # turn the grid on
plt.xlim(-1, 20)  # fix the x-axis range to -1..20
Out[15]:
In [ ]:
#get list of column names in pandas data frame
# (my_dataframe is presumably a placeholder for whichever data frame is being inspected)
list(my_dataframe.columns.values)
In [ ]:
# Walk the rows of ut by position, stopping once i reaches 10. For rows whose label starts
# with 'R', look up the related KO/CO ids for the row's KEGG value and plot the
# row-normalized KO data (red) together with the row's own dayList values (black).
# NOTE(review): the block indentation was lost when this notebook was exported to text, so
# the nesting of the statements below cannot be read off this listing -- restore it from
# the original notebook before running.
for i in range(0,len(ut)):
if i == 10:
break
# p is row i of ut; p.name is that row's index label.
p = ut.iloc[i,:]
n = p.name
if n[0] == 'R':
#do the plotting,
#print 'yes'
CO = p.KEGG
kos = CO_withKO[CO]['Related KO']
cos = CO_withKO[CO]['Related CO']
#Tracer()()
for k in kos:
if k in KO_RawData.index:
# NOTE(review): the membership test above is on the single id k, but the lookup below
# uses the whole list kos -- confirm that is intended.
kData=KO_RawData.loc[kos].dropna()
# Divide every row by its own row sum, so each row sums to 1.
kData=(kData.T/kData.sum(axis=1)).T
#? why RawData, the output from the K-means will have the normalized data, use that for CO
#bc easier since that is the file I am working with right now.
#cData=CO_RawData.loc[cos].dropna()
#cData=(cData.T/cData.sum(axis=1)).T
# Take the dayList entries of this row and turn them into a one-row data frame.
cData = pd.DataFrame(p[dayList]).T
#go back and check, but I think this next step is already done
#cData=(cData.T/cData.sum(axis=1)).T
fig, ax=plt.subplots(1)
kData.T.plot(color='r', ax=ax)
cData.T.plot(color='k', ax=ax)
# NOTE(review): this else has only comments beneath it; Python needs at least one
# statement there (e.g. pass) for the cell to parse.
else:
#skip over the KO plotting, so effectively doing nothing
#print 'no'
Write a function to match RI number and cNumbers
In [ ]:
def findRInumber(dataIn, KEGGin):
    """Find the possible RI numbers for a given KEGG number.

    Args:
        dataIn: table (e.g. a pandas data frame) that has a 'KEGG' column;
            its index labels are what gets reported.
        KEGGin: the KEGG number to look for.

    Returns:
        A list with the index label of every row whose 'KEGG' value equals
        KEGGin, in row order. The list is empty when nothing matches.
    """
    matches = [
        dataIn.index[i]
        for i, KEGG in enumerate(dataIn['KEGG'])
        if KEGG == KEGGin
    ]
    # Still print every hit so running the cell shows them, but also hand
    # them back: previously the function returned nothing, so
    # `m = findRInumber(...)` always left m as None.
    for t in matches:
        print(t)
    return matches
#For example: this will give back one row, C18028 will be multiple
# Look up C00031 in forRelatedness; the bare `m` on the last line echoes whatever the
# call returned.
m = findRInumber(forRelatedness,'C00031')
m
In [ ]:
#to copy a matrix I would think this works: NOPE
#forRelatedness = CcoClust# this is NOT making a new copy...
#instead it makes a new name pointing to an existing data frame. So you now have two ways to
#reference the same data frame. Make a change with one name and you can see the same change
#using the other name. Odd. No idea why you would want that.
#(to get an independent copy instead, use: forRelatedness = CcoClust.copy())
In [ ]:
##this is the test that finally let me understand enumerate
# for index, KEGG in enumerate(useSmall['KEGG']):
# print index,KEGG
In [ ]:
# Windows
# Open a KEGG compound page in Chrome. The trailing %s in chrome_path is the placeholder
# that the webbrowser module replaces with the URL when it builds the command line.
# NOTE(review): webbrowser is not imported in any cell visible here -- the cell needs
# `import webbrowser` to run.
chrome_path = 'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe %s'
url = "http://www.genome.jp/dbget-bin/www_bget?cpd:C00019"
webbrowser.get(chrome_path).open_new(url)
#while a nice idea, this stays open until you close the web browser window.
In [1]:
from IPython.display import HTML
tList = ['C02265','C00001']
# Build one iframe tag per compound id (pointing at its KEGG compound page) and print it.
for i in tList:
    ml = '<iframe src = http://www.genome.jp/dbget-bin/www_bget?cpd:' + i + ' width=700 height=350></iframe>'
    # print() with parentheses runs on Python 3 as well as Python 2
    print(ml)
In [3]:
from IPython.display import HTML
# Wrap an iframe tag for CO's KEGG compound page (width 700, height 350) in an IPython
# HTML object; the tag is built by plain string concatenation around the id in CO.
CO='C02265'
HTML('<iframe src = http://www.genome.jp/dbget-bin/www_bget?cpd:' + CO + ' width=700 height=350></iframe>')
Out[3]:
In [ ]: